# Read the admissions data set into `test` and print per-column summary
# statistics.
test <- read.csv(file = "Admission_Predict_Ver1.1.csv")
summary(test)
## Serial.No. GRE.Score TOEFL.Score University.Rating
## Min. : 1.0 Min. :290.0 Min. : 92.0 Min. :1.000
## 1st Qu.:125.8 1st Qu.:308.0 1st Qu.:103.0 1st Qu.:2.000
## Median :250.5 Median :317.0 Median :107.0 Median :3.000
## Mean :250.5 Mean :316.5 Mean :107.2 Mean :3.114
## 3rd Qu.:375.2 3rd Qu.:325.0 3rd Qu.:112.0 3rd Qu.:4.000
## Max. :500.0 Max. :340.0 Max. :120.0 Max. :5.000
## SOP LOR CGPA Research
## Min. :1.000 Min. :1.000 Min. :6.800 Min. :0.00
## 1st Qu.:2.500 1st Qu.:3.000 1st Qu.:8.127 1st Qu.:0.00
## Median :3.500 Median :3.500 Median :8.560 Median :1.00
## Mean :3.374 Mean :3.484 Mean :8.576 Mean :0.56
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:9.040 3rd Qu.:1.00
## Max. :5.000 Max. :5.000 Max. :9.920 Max. :1.00
## Chance.of.Admit
## Min. :0.3400
## 1st Qu.:0.6300
## Median :0.7200
## Mean :0.7217
## 3rd Qu.:0.8200
## Max. :0.9700
# Preview the first six rows of the data (six is head()'s default).
head(test, n = 6L)
## Serial.No. GRE.Score TOEFL.Score University.Rating SOP LOR CGPA Research
## 1 1 337 118 4 4.5 4.5 9.65 1
## 2 2 324 107 4 4.0 4.5 8.87 1
## 3 3 316 104 3 3.0 3.5 8.00 1
## 4 4 322 110 3 3.5 2.5 8.67 1
## 5 5 314 103 2 2.0 3.0 8.21 0
## 6 6 330 115 5 4.5 3.0 9.34 1
## Chance.of.Admit
## 1 0.92
## 2 0.76
## 3 0.72
## 4 0.80
## 5 0.65
## 6 0.90
# Put the columns of `test` on the search path so they can be used by bare name.
# NOTE(review): attach() is generally discouraged; it is kept because removing
# it would break any later code that refers to columns without `test$`.
attach(test)

# Linear regression and diagnostic plots ----

# Chance.of.Admit regressed on every other column of `test`.
linear <- lm(formula = Chance.of.Admit ~ ., data = test)
# summary(linear)
plot(x = linear)

# University.Rating regressed on every other column (overwrites `linear`).
linear <- lm(formula = University.Rating ~ ., data = test)
# summary(linear)
plot(x = linear)

# Research regressed on every other column.
# NOTE(review): no `family` is given, so glm() falls back to its gaussian
# default, i.e. this is an ordinary least-squares fit despite the name
# `logmod`. Confirm whether family = binomial was intended.
logmod <- glm(formula = Research ~ ., data = test)
# summary(logmod)
plot(x = logmod)
# Simple linear regression of Chance.of.Admit on CGPA, shown as a scatter plot
# with the fitted line overlaid.
# Fitting with `data = test` keeps the coefficient named `CGPA` (rather than
# `test$CGPA`), so the model can also be used with predict(newdata = ...).
chance.vs.CGPA <- lm(Chance.of.Admit ~ CGPA, data = test)
# In a formula y ~ x the response is drawn on the y-axis, so CGPA labels the
# x-axis and the admission chance labels the y-axis.
plot(
  Chance.of.Admit ~ CGPA,
  data = test,
  xlab = "CGPA",
  ylab = "Chance of Admission",
  main = "Chance of Admission VS CGPA"
)
# abline() reads intercept and slope from the fitted model; it has no `data`
# argument, so none is passed.
abline(chance.vs.CGPA, col = "red", lwd = 3)
#sum ((predict(chance.vs.CGPA, data.frame(test))) - test$Chance.of.Admit)^2 /nrow(test)
# Using backward elimination, we remove the least significant predictor one at a time until every remaining predictor is significant.
# Backward elimination for Chance.of.Admit ----
# Step 0: full model, every other column of `test` as a predictor.
linear <- lm(Chance.of.Admit ~ ., data = test)
# summary(linear)

# Step 1: drop University.Rating (reported as the highest non-significant
# p-value). Serial.No. is dropped in the same step, because the explicit
# formula below no longer lists it.
linear <- lm(
  Chance.of.Admit ~ GRE.Score + TOEFL.Score + SOP + LOR + CGPA + Research,
  data = test
)
# summary(linear)

# Step 2: drop SOP (reported as the next-highest non-significant p-value).
# This is the final model of the elimination; it is fitted once.
linear <- lm(
  Chance.of.Admit ~ GRE.Score + TOEFL.Score + LOR + CGPA + Research,
  data = test
)
# summary(linear)
#plot(test)
#linearPlot <- plot(Chance.of.Admit~.,data=test)
# Backward elimination for Research using lm() ----
# Starting model: all columns except Chance.of.Admit as predictors.
linear <- lm(
  formula = Research ~ Serial.No. + GRE.Score + TOEFL.Score +
    University.Rating + SOP + LOR + CGPA,
  data = test
)
# summary(linear)

# Without SOP.
linear <- lm(
  formula = Research ~ Serial.No. + GRE.Score + TOEFL.Score +
    University.Rating + LOR + CGPA,
  data = test
)
# summary(linear)

# Without SOP and CGPA.
linear <- lm(
  formula = Research ~ Serial.No. + GRE.Score + TOEFL.Score +
    University.Rating + LOR,
  data = test
)
# summary(linear)

# Without SOP, CGPA and LOR.
linear <- lm(
  formula = Research ~ Serial.No. + GRE.Score + TOEFL.Score +
    University.Rating,
  data = test
)
# summary(linear)

# Without SOP, CGPA, LOR and TOEFL.Score.
linear <- lm(
  formula = Research ~ Serial.No. + GRE.Score + University.Rating,
  data = test
)
# summary(linear)

# Without SOP, CGPA, LOR, TOEFL.Score and Serial.No.
# The leading unary `+` does not change the model; it is retained so the fitted
# call is the same as the one shown in the recorded summary.
linear <- lm(
  formula = Research ~ +GRE.Score + University.Rating,
  data = test
)
summary(linear)
##
## Call:
## lm(formula = Research ~ +GRE.Score + University.Rating, data = test)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.14033 -0.35017 0.00906 0.29255 1.00181
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.415603 0.625451 -10.258 <2e-16 ***
## GRE.Score 0.021546 0.002099 10.266 <2e-16 ***
## University.Rating 0.050337 0.020731 2.428 0.0155 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4089 on 497 degrees of freedom
## Multiple R-squared: 0.3254, Adjusted R-squared: 0.3227
## F-statistic: 119.9 on 2 and 497 DF, p-value: < 2.2e-16
# Diagnostic plots for the current `linear` model.
plot(x = linear)
# Based on the Normal Q-Q plot, we can determine that the data fits well.
# Backward elimination for University.Rating ----
# Starting model: all columns except Chance.of.Admit as predictors.
linear <- lm(
  formula = University.Rating ~ Serial.No. + GRE.Score + TOEFL.Score +
    SOP + LOR + CGPA + Research,
  data = test
)
summary(linear)
##
## Call:
## lm(formula = University.Rating ~ Serial.No. + GRE.Score + TOEFL.Score +
## SOP + LOR + CGPA + Research, data = test)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.34352 -0.46556 -0.03557 0.44046 2.44809
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.4170319 1.2125399 -5.292 1.82e-07 ***
## Serial.No. 0.0001812 0.0002260 0.802 0.42307
## GRE.Score 0.0065910 0.0059476 1.108 0.26833
## TOEFL.Score 0.0209679 0.0103520 2.025 0.04336 *
## SOP 0.4474027 0.0507642 8.813 < 2e-16 ***
## LOR 0.1498125 0.0488318 3.068 0.00227 **
## CGPA 0.3578395 0.1141124 3.136 0.00182 **
## Research 0.0923371 0.0783086 1.179 0.23891
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7109 on 492 degrees of freedom
## Multiple R-squared: 0.619, Adjusted R-squared: 0.6135
## F-statistic: 114.2 on 7 and 492 DF, p-value: < 2.2e-16
# Refit University.Rating without Serial.No.
linear <- lm(
  formula = University.Rating ~ GRE.Score + TOEFL.Score + SOP + LOR +
    CGPA + Research,
  data = test
)
summary(linear)
##
## Call:
## lm(formula = University.Rating ~ GRE.Score + TOEFL.Score + SOP +
## LOR + CGPA + Research, data = test)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.36251 -0.47140 -0.04223 0.45376 2.41297
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.295220 1.202548 -5.235 2.45e-07 ***
## GRE.Score 0.006468 0.005943 1.088 0.27705
## TOEFL.Score 0.020128 0.010295 1.955 0.05114 .
## SOP 0.441757 0.050255 8.790 < 2e-16 ***
## LOR 0.154072 0.048524 3.175 0.00159 **
## CGPA 0.364222 0.113793 3.201 0.00146 **
## Research 0.096184 0.078133 1.231 0.21890
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7106 on 493 degrees of freedom
## Multiple R-squared: 0.6185, Adjusted R-squared: 0.6138
## F-statistic: 133.2 on 6 and 493 DF, p-value: < 2.2e-16
# Refit University.Rating without GRE.Score (and without Serial.No.).
linear <- lm(
  formula = University.Rating ~ TOEFL.Score + SOP + LOR + CGPA + Research,
  data = test
)
summary(linear)
##
## Call:
## lm(formula = University.Rating ~ TOEFL.Score + SOP + LOR + CGPA +
## Research, data = test)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.37560 -0.47448 -0.03629 0.45065 2.41676
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.243653 0.715856 -7.325 9.79e-13 ***
## TOEFL.Score 0.025353 0.009109 2.783 0.00559 **
## SOP 0.440906 0.050259 8.773 < 2e-16 ***
## LOR 0.151540 0.048478 3.126 0.00188 **
## CGPA 0.414718 0.103920 3.991 7.59e-05 ***
## Research 0.120784 0.074805 1.615 0.10702
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7107 on 494 degrees of freedom
## Multiple R-squared: 0.6176, Adjusted R-squared: 0.6137
## F-statistic: 159.5 on 5 and 494 DF, p-value: < 2.2e-16
# Refit University.Rating without Research; the remaining predictors are
# TOEFL.Score, SOP, LOR and CGPA.
linear <- lm(
  formula = University.Rating ~ TOEFL.Score + SOP + LOR + CGPA,
  data = test
)
summary(linear)
##
## Call:
## lm(formula = University.Rating ~ TOEFL.Score + SOP + LOR + CGPA,
## data = test)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.46231 -0.46269 -0.04935 0.45262 2.39211
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.62010 0.67792 -8.290 1.07e-15 ***
## TOEFL.Score 0.02695 0.00907 2.971 0.00311 **
## SOP 0.44423 0.05030 8.832 < 2e-16 ***
## LOR 0.15563 0.04849 3.210 0.00142 **
## CGPA 0.44360 0.10254 4.326 1.83e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7119 on 495 degrees of freedom
## Multiple R-squared: 0.6155, Adjusted R-squared: 0.6124
## F-statistic: 198.1 on 4 and 495 DF, p-value: < 2.2e-16
# Backward elimination for Research using glm() ----
# Starting model: Research on every other column (this includes
# Chance.of.Admit, because `.` expands to all remaining columns).
# NOTE(review): no `family` is given, so this is a gaussian fit (the recorded
# summary reports the gaussian family), not a logistic regression. Confirm
# whether family = binomial was intended for the 0/1 response.
logmod <- glm(formula = Research ~ ., data = test)
summary(logmod)
##
## Call:
## glm(formula = Research ~ ., data = test)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.01724 -0.33223 0.00753 0.29143 0.99776
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.738e+00 7.931e-01 -5.974 4.44e-09 ***
## Serial.No. 7.426e-05 1.313e-04 0.565 0.572113
## GRE.Score 1.921e-02 3.327e-03 5.776 1.36e-08 ***
## TOEFL.Score -8.741e-03 5.980e-03 -1.462 0.144417
## University.Rating 2.412e-02 2.566e-02 0.940 0.347657
## SOP 1.441e-02 3.108e-02 0.464 0.643147
## LOR 1.404e-02 2.840e-02 0.494 0.621210
## CGPA -9.398e-02 7.457e-02 -1.260 0.208213
## Chance.of.Admit 1.065e+00 3.066e-01 3.474 0.000557 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1633439)
##
## Null deviance: 123.200 on 499 degrees of freedom
## Residual deviance: 80.202 on 491 degrees of freedom
## AIC: 523.91
##
## Number of Fisher Scoring iterations: 2
# Diagnostic plots for the full model.
plot(x = logmod)
# Refit without LOR. Chance.of.Admit, which the `.` of the full model had
# included, is also not listed from here on.
logmod <- glm(
  formula = Research ~ Serial.No. + GRE.Score + TOEFL.Score +
    University.Rating + SOP + CGPA,
  data = test
)
summary(logmod)
##
## Call:
## glm(formula = Research ~ Serial.No. + GRE.Score + TOEFL.Score +
## University.Rating + SOP + CGPA, data = test)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.09472 -0.33171 0.01616 0.28395 1.02222
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.3439527 0.6572538 -9.652 < 2e-16 ***
## Serial.No. 0.0001857 0.0001291 1.438 0.151
## GRE.Score 0.0216466 0.0032778 6.604 1.04e-10 ***
## TOEFL.Score -0.0054357 0.0059720 -0.910 0.363
## University.Rating 0.0344431 0.0256319 1.344 0.180
## SOP 0.0302826 0.0298521 1.014 0.311
## CGPA 0.0443213 0.0648898 0.683 0.495
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1670782)
##
## Null deviance: 123.20 on 499 degrees of freedom
## Residual deviance: 82.37 on 493 degrees of freedom
## AIC: 533.24
##
## Number of Fisher Scoring iterations: 2
# plot(logmod)
# Refit without LOR and CGPA.
logmod <- glm(
  formula = Research ~ Serial.No. + GRE.Score + TOEFL.Score +
    University.Rating + SOP,
  data = test
)
summary(logmod)
##
## Call:
## glm(formula = Research ~ Serial.No. + GRE.Score + TOEFL.Score +
## University.Rating + SOP, data = test)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.09070 -0.33673 0.01374 0.28546 1.03378
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.4208255 0.6471960 -9.921 < 2e-16 ***
## Serial.No. 0.0001936 0.0001285 1.506 0.133
## GRE.Score 0.0225907 0.0029705 7.605 1.45e-13 ***
## TOEFL.Score -0.0042408 0.0057070 -0.743 0.458
## University.Rating 0.0375039 0.0252235 1.487 0.138
## SOP 0.0358048 0.0287209 1.247 0.213
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1668978)
##
## Null deviance: 123.200 on 499 degrees of freedom
## Residual deviance: 82.448 on 494 degrees of freedom
## AIC: 531.72
##
## Number of Fisher Scoring iterations: 2
# Refit without LOR, CGPA and TOEFL.Score.
logmod <- glm(
  formula = Research ~ Serial.No. + GRE.Score + University.Rating + SOP,
  data = test
)
summary(logmod)
##
## Call:
## glm(formula = Research ~ Serial.No. + GRE.Score + University.Rating +
## SOP, data = test)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.09592 -0.34393 0.00147 0.29124 1.03427
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.3837643 0.6449795 -9.898 <2e-16 ***
## Serial.No. 0.0002017 0.0001280 1.575 0.116
## GRE.Score 0.0210992 0.0021887 9.640 <2e-16 ***
## University.Rating 0.0346801 0.0249243 1.391 0.165
## SOP 0.0319968 0.0282472 1.133 0.258
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1667468)
##
## Null deviance: 123.20 on 499 degrees of freedom
## Residual deviance: 82.54 on 495 degrees of freedom
## AIC: 530.27
##
## Number of Fisher Scoring iterations: 2
# Refit without LOR, CGPA, TOEFL.Score and SOP.
logmod <- glm(
  formula = Research ~ Serial.No. + GRE.Score + University.Rating,
  data = test
)
summary(logmod)
##
## Call:
## glm(formula = Research ~ Serial.No. + GRE.Score + University.Rating,
## data = test)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.10835 -0.34957 0.00049 0.28952 1.02269
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.5389338 0.6304444 -10.372 <2e-16 ***
## Serial.No. 0.0001855 0.0001272 1.458 0.1455
## GRE.Score 0.0217887 0.0021030 10.361 <2e-16 ***
## University.Rating 0.0504027 0.0207077 2.434 0.0153 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.166842)
##
## Null deviance: 123.200 on 499 degrees of freedom
## Residual deviance: 82.754 on 496 degrees of freedom
## AIC: 529.57
##
## Number of Fisher Scoring iterations: 2
# Refit without LOR, CGPA, TOEFL.Score, SOP and Serial.No.; only GRE.Score and
# University.Rating remain as predictors.
logmod <- glm(
  formula = Research ~ GRE.Score + University.Rating,
  data = test
)
summary(logmod)
##
## Call:
## glm(formula = Research ~ GRE.Score + University.Rating, data = test)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.14033 -0.35017 0.00906 0.29255 1.00181
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.415603 0.625451 -10.258 <2e-16 ***
## GRE.Score 0.021546 0.002099 10.266 <2e-16 ***
## University.Rating 0.050337 0.020731 2.428 0.0155 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1672199)
##
## Null deviance: 123.200 on 499 degrees of freedom
## Residual deviance: 83.108 on 497 degrees of freedom
## AIC: 529.71
##
## Number of Fisher Scoring iterations: 2
# Diagnostic plots for the final two-predictor model.
plot(x = logmod)
# Based on the logmod summary, the two most significant variables are University.Rating and GRE.Score. Based on these results, as the University Rating increases, so does the number of students who conduct research.
# Leave-one-out cross-validation (LOOCV) of Chance.of.Admit ~ CGPA.
# For every row i, the model is fitted on all other rows and then used to
# predict the single held-out row; the mean of the squared errors is the LOOCV
# estimate of the prediction MSE.

# Nothing below is random; the seed is kept so the RNG state seen by any later
# code is the same as before.
set.seed(7861)
cvlm <- vector("list", nrow(test))
msecv <- numeric(nrow(test))
for (i in seq_len(nrow(test))) {
  # Fit on every row except row i. Plain column names plus `data =` are used
  # so that predict() can evaluate the same terms on `newdata`.
  cvlm[[i]] <- lm(Chance.of.Admit ~ CGPA, data = test[-i, ])
  # Squared prediction error on the held-out row (a single number).
  msecv[i] <- (
    predict(cvlm[[i]], newdata = test[i, , drop = FALSE]) -
      test$Chance.of.Admit[i]
  )^2
}
# Output the mean of the per-row squared errors.
mean(msecv)
# NOTE: the value recorded below predates the corrected loop; re-run to refresh.
## [1] 0.06879746
# Leave-one-out cross-validation (LOOCV) of the multi-predictor model for
# Chance.of.Admit.
# For every row i, the model is fitted on all other rows and then used to
# predict the single held-out row; the mean of the squared errors is the LOOCV
# estimate of the prediction MSE.
# NOTE(review): Serial.No. is kept as a predictor to preserve the predictor set
# of this loop, although the backward-elimination model fitted earlier in this
# file does not contain it. Confirm which model should be validated.

# Nothing below is random; the seed is kept so the RNG state seen by any later
# code is the same as before.
set.seed(7861)
cvlm <- vector("list", nrow(test))
msecv <- numeric(nrow(test))
for (i in seq_len(nrow(test))) {
  # Fit on every row except row i. Plain column names plus `data =` are used
  # so that predict() can evaluate the same terms on `newdata`.
  cvlm[[i]] <- lm(
    Chance.of.Admit ~ Serial.No. + GRE.Score + TOEFL.Score + LOR + CGPA +
      Research,
    data = test[-i, ]
  )
  # Squared prediction error on the held-out row (a single number).
  msecv[i] <- (
    predict(cvlm[[i]], newdata = test[i, , drop = FALSE]) -
      test$Chance.of.Admit[i]
  )^2
}
# Output the mean of the per-row squared errors.
mean(msecv)
# NOTE: the value recorded below predates the corrected loop; re-run to refresh.
## [1] 0.0666215
# Logistic regression of Research on every other column, evaluated on the same
# data it was fitted to (in-sample).
ResearchData <- test$Research
ResearchDataFactor <- factor(test$Research)
simlog <- glm(factor(Research) ~ ., family = "binomial", data = test)
# Predicted class: TRUE when the fitted probability exceeds 0.5.
predictedResearch <- predict(simlog, type = "response") > 0.5
# Confusion table: rows are the predicted class, columns the observed Research.
# `dnn` keeps the row dimension unnamed so the printed table is unchanged.
table(predictedResearch, ResearchData, dnn = c("", "ResearchData"))
## ResearchData
## 0 1
## FALSE 154 57
## TRUE 66 223
# Misclassification rate = wrong predictions / ALL observations. It is computed
# from the predictions instead of hard-coded counts, so it stays correct if the
# data or the model change.
misclassificationRate <- mean(predictedResearch != (ResearchData == 1))
capture.output(cat('Misclassification rate = ', misclassificationRate))
## [1] "Misclassification rate = 0.246"
library(MLmetrics)
##
## Attaching package: 'MLmetrics'
## The following object is masked from 'package:base':
##
## Recall
# In-sample classification scores for the logistic model `simlog`.
# Predicted class as 0/1, using a 0.5 probability cut-off.
predictedClass <- as.numeric(predict(simlog, type = "response") > 0.5)
# MLmetrics argument order differs between functions: F1_Score() and
# Sensitivity() take (y_true, y_pred), Accuracy() takes (y_pred, y_true).
# Arguments are therefore passed by name, and Research = 1 is declared the
# positive class explicitly (the default would be the first level, "0").
F1 <- F1_Score(y_true = ResearchData, y_pred = predictedClass, positive = "1")
Accu <- Accuracy(y_pred = predictedClass, y_true = ResearchData)
Sens <- Sensitivity(
  y_true = ResearchData,
  y_pred = predictedClass,
  positive = "1"
)
# One-row summary table of the three scores.
scoreTable <- cbind(F1, Accu, Sens)
colnames(scoreTable) <- c("F1 Score", "Accuracy", "Sensitivity")
rownames(scoreTable) <- c("Logistic Regression")
#rownames(scoreTable)<-c("Logistic Regression", "Neural Network")
round(scoreTable, 3)
## F1 Score Accuracy Sensitivity
## Logistic Regression 0.784 0.754 0.796
# Leave-one-out cross-validation (LOOCV) of
# University.Rating ~ TOEFL.Score + SOP + LOR + CGPA.
# For every row i, the model is fitted on all other rows and then used to
# predict the single held-out row; the mean of the squared errors is the LOOCV
# estimate of the prediction MSE.

# Nothing below is random; the seed is kept so the RNG state seen by any later
# code is the same as before.
set.seed(7861)
cvlm <- vector("list", nrow(test))
msecv <- numeric(nrow(test))
for (i in seq_len(nrow(test))) {
  # Fit on every row except row i; subsetting the data frame drops the same
  # row from every column, so predictors and response stay aligned.
  cvlm[[i]] <- lm(
    University.Rating ~ TOEFL.Score + SOP + LOR + CGPA,
    data = test[-i, ]
  )
  # Squared prediction error on the held-out row (a single number).
  msecv[i] <- (
    predict(cvlm[[i]], newdata = test[i, , drop = FALSE]) -
      test$University.Rating[i]
  )^2
}
# Output the mean of the per-row squared errors.
mean(msecv)
# NOTE: the value recorded below predates the corrected loop; re-run to refresh.
## [1] 3.373402